{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing: Clean Up & Tokenize Questions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Break question titles into tokens, and perform token-level normalization: expand shortened words, correct spelling, etc."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Original question datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('none')\n",
    "df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('none')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "df_all = pd.concat([df_train, df_test])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Stopwords customized for Quora dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "stopwords = set(kg.io.load_lines(project.aux_dir + 'stopwords.vocab'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pre-composed spelling correction dictionary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "spelling_corrections = kg.io.load_json(project.aux_dir + 'spelling_corrections.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Tools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = nltk.tokenize.RegexpTokenizer(r'\\w+')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess and tokenize questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate(text, translation):\n",
    "    for token, replacement in translation.items():\n",
    "        text = text.replace(token, ' ' + replacement + ' ')\n",
    "    text = text.replace('  ', ' ')\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def spell_digits(text):\n",
    "    translation = {\n",
    "        '0': 'zero',\n",
    "        '1': 'one',\n",
    "        '2': 'two',\n",
    "        '3': 'three',\n",
    "        '4': 'four',\n",
    "        '5': 'five',\n",
    "        '6': 'six',\n",
    "        '7': 'seven',\n",
    "        '8': 'eight',\n",
    "        '9': 'nine',\n",
    "    }\n",
    "    return translate(text, translation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def expand_negations(text):\n",
    "    translation = {\n",
    "        \"can't\": 'can not',\n",
    "        \"won't\": 'would not',\n",
    "        \"shan't\": 'shall not',\n",
    "    }\n",
    "    text = translate(text, translation)\n",
    "    return text.replace(\"n't\", \" not\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def correct_spelling(text):\n",
    "    return ' '.join(\n",
    "        spelling_corrections.get(token, token)\n",
    "        for token in tokenizer.tokenize(text)\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_question_tokens(question, lowercase=True, spellcheck=True, remove_stopwords=True):\n",
    "    if lowercase:\n",
    "        question = question.lower()\n",
    "    \n",
    "    if spellcheck:\n",
    "        question = correct_spelling(question)\n",
    "    \n",
    "    question = spell_digits(question)\n",
    "    question = expand_negations(question)\n",
    "\n",
    "    tokens = [token for token in tokenizer.tokenize(question.lower() if lowercase else question)]    \n",
    "    if remove_stopwords:\n",
    "        tokens = [token for token in tokens if token not in stopwords]\n",
    "    \n",
    "    tokens.append('.')\n",
    "    return tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_question_pair_tokens_spellcheck(pair):\n",
    "    return [\n",
    "        get_question_tokens(pair[0], lowercase=False, spellcheck=True, remove_stopwords=False),\n",
    "        get_question_tokens(pair[1], lowercase=False, spellcheck=True, remove_stopwords=False),\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_question_pair_tokens_lowercase_spellcheck(pair):\n",
    "    return [\n",
    "        get_question_tokens(pair[0], lowercase=True, spellcheck=True, remove_stopwords=False),\n",
    "        get_question_tokens(pair[1], lowercase=True, spellcheck=True, remove_stopwords=False),\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_question_pair_tokens_lowercase_spellcheck_remove_stopwords(pair):\n",
    "    return [\n",
    "        get_question_tokens(pair[0], lowercase=True, spellcheck=True, remove_stopwords=True),\n",
    "        get_question_tokens(pair[1], lowercase=True, spellcheck=True, remove_stopwords=True),\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tokenize the questions, correct spelling, but keep the upper/lower case."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 2751/2751 [10:19<00:00,  4.44it/s]\n"
     ]
    }
   ],
   "source": [
    "tokens_spellcheck = kg.jobs.map_batch_parallel(\n",
    "    df_all.as_matrix(columns=['question1', 'question2']),\n",
    "    item_mapper=get_question_pair_tokens_spellcheck,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tokenize the questions, convert to lowercase and correct spelling, keep the stopwords (useful for neural models)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 2751/2751 [14:28<00:00,  3.17it/s]\n"
     ]
    }
   ],
   "source": [
    "tokens_lowercase_spellcheck = kg.jobs.map_batch_parallel(\n",
    "    df_all.as_matrix(columns=['question1', 'question2']),\n",
    "    item_mapper=get_question_pair_tokens_lowercase_spellcheck,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Just as before, but also with stopwords removed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 2751/2751 [14:03<00:00,  3.26it/s]\n"
     ]
    }
   ],
   "source": [
    "tokens_lowercase_spellcheck_no_stopwords = kg.jobs.map_batch_parallel(\n",
    "    df_all.as_matrix(columns=['question1', 'question2']),\n",
    "    item_mapper=get_question_pair_tokens_lowercase_spellcheck_remove_stopwords,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract question vocabulary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5500172/5500172 [00:48<00:00, 114342.48it/s]\n"
     ]
    }
   ],
   "source": [
    "vocab = set()\n",
    "for question in progressbar(np.array(tokens_lowercase_spellcheck).ravel()):\n",
    "    for token in question:\n",
    "        vocab.add(token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_no_stopwords = vocab - stopwords"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save preprocessed data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tokenized questions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists(project.preprocessed_data_dir):\n",
    "    os.makedirs(project.preprocessed_data_dir)\n",
    "if not os.path.exists(project.features_dir):\n",
    "    os.makedirs(project.features_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "kg.io.save(\n",
    "    tokens_spellcheck[:len(df_train)],\n",
    "    project.preprocessed_data_dir + 'tokens_spellcheck_train.pickle'\n",
    ")\n",
    "kg.io.save(\n",
    "    tokens_spellcheck[len(df_train):],\n",
    "    project.preprocessed_data_dir + 'tokens_spellcheck_test.pickle'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "kg.io.save(\n",
    "    tokens_lowercase_spellcheck[:len(df_train)],\n",
    "    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_train.pickle'\n",
    ")\n",
    "kg.io.save(\n",
    "    tokens_lowercase_spellcheck[len(df_train):],\n",
    "    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_test.pickle'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "kg.io.save(\n",
    "    tokens_lowercase_spellcheck_no_stopwords[:len(df_train)],\n",
    "    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle'\n",
    ")\n",
    "kg.io.save(\n",
    "    tokens_lowercase_spellcheck_no_stopwords[len(df_train):],\n",
    "    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Question vocabulary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "kg.io.save_lines(\n",
    "    sorted(list(vocab)),\n",
    "    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck.vocab'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "kg.io.save_lines(\n",
    "    sorted(list(vocab_no_stopwords)),\n",
    "    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords.vocab'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ground truth."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "kg.io.save(df_train['is_duplicate'].values, project.features_dir + 'y_train.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
